Scatter Plots and Smoothing Curves
# Load the objects stored in the .rda file into the current environment,
# then list the names of everything that is now defined.
load(file = "chapter_3_Part_I_0912.rda")
ls()
## [1] "A" "age.ecdf" "cowmap" "custdata"
## [5] "custdata2" "custdata3" "custdata4" "dhus"
## [9] "dpus" "dtest" "dtrain" "g.ecdf"
## [13] "g1" "g2" "g3" "g4"
## [17] "g5" "hhdata" "medianincome" "o.sor"
## [21] "p" "p1" "p2" "p3"
## [25] "p4" "poly.age" "poly.x" "poly.y"
## [29] "psub" "result" "schlmap" "sor.df"
## [33] "sor.df.2" "sor.df.o" "sor.tbl" "sor.tbl.2"
## [37] "sub" "tbl.hm" "tbl.hm.df" "tbl.mh"
## [41] "tbl.mh.df" "theme.kr" "x" "y"
# Keep only the rows with age strictly between 0 and 100 and a positive
# income. subset() also drops rows for which the condition evaluates to NA.
custdata2 <- subset(custdata, age > 0 & age < 100 & income > 0)
# Print numbers with two significant digits from here on.
options(digits = 2)
# Correlation between age and income in the filtered data.
with(custdata2, cor(age, income))
## [1] -0.022
- Scatter Plot. 화살표를 넣기 위하여
grid 패키지 등록
# ggplot() and every geom/stat used below come from ggplot2, which the
# visible part of this script never attaches; attach it explicitly so the
# script runs in a fresh session. grid supplies arrow() and unit(), used by
# the segment annotations further down.
library(ggplot2)
library(grid)
# Base scatter plot of income against age.
(g1 <- ggplot(custdata2, aes(x = age, y = income)) + geom_point())

# Same plot with the income axis limited to 0-250000. Rows outside that
# range are dropped from the layer (see the warning that follows).
(g2 <- g1 + ylim(0, 250000))
## Warning: Removed 25 rows containing missing values (geom_point).

# Mark ages 20-50 with a double-headed blue arrow and caption it in red.
(g3 <- g2 +
  annotate(
    geom = "segment",
    x = 20, xend = 50, y = 225000, yend = 225000,
    colour = "blue", size = 0.5,
    arrow = arrow(ends = "both", length = unit(0.2, "cm"))
  ) +
  annotate(
    geom = "text",
    x = 35, y = 235000,
    label = "나이와 함께 소득도 증가",
    family = "HCR Dotum LVT", size = 5, colour = "red"
  ))
## Warning: Removed 25 rows containing missing values (geom_point).

# Mark ages 60-90 with a second double-headed arrow and its own caption.
(g4 <- g3 +
  annotate(
    geom = "segment",
    x = 60, xend = 90, y = 190000, yend = 190000,
    colour = "blue", size = 0.5,
    arrow = arrow(ends = "both", length = unit(0.2, "cm"))
  ) +
  annotate(
    geom = "text",
    x = 75, y = 200000,
    label = "나이는 들고 소득은 감소",
    family = "HCR Dotum LVT", size = 5, colour = "red"
  ))
## Warning: Removed 25 rows containing missing values (geom_point).

# Overlay a straight-line (lm) fit on the scatter plot. ylim() drops the rows
# outside 0-250000 before the smoother is computed, hence the stat_smooth
# warning below.
g1 +
  stat_smooth(method = "lm") +
  ylim(0, 250000)
## Warning: Removed 25 rows containing missing values (stat_smooth).
## Warning: Removed 25 rows containing missing values (geom_point).

# Same plot with a loess smoother instead of the straight line.
g1 +
  stat_smooth(method = "loess") +
  ylim(0, 250000)
## Warning: Removed 25 rows containing missing values (stat_smooth).
## Warning: Removed 25 rows containing missing values (geom_point).

# geom_smooth() with its default method; the message below reports that
# loess was chosen.
g1 +
  geom_smooth() +
  ylim(0, 250000)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## Warning: Removed 25 rows containing missing values (stat_smooth).
## Warning: Removed 25 rows containing missing values (geom_point).

# Count the FALSE / TRUE / NA values of the logical insurance indicator.
summary(custdata2[["health.ins"]])
## Mode FALSE TRUE NA's
## logical 119 791 0
# Plot the insurance indicator (logical coerced to 0/1) against age. A small
# jitter in both directions keeps overlapping points distinguishable.
(h1 <- ggplot(custdata2, aes(x = age, y = as.numeric(health.ins))) +
  geom_point(position = position_jitter(width = 0.05, height = 0.05)))

# Add a smoother using geom_smooth()'s default method.
(h2 <- h1 + geom_smooth())
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

- glm의 하나인 logistic regression으로 적합시키면,
# Fit a logistic regression curve through the 0/1 insurance indicator.
# Since ggplot2 2.0.0, extra arguments for the model-fitting function must be
# supplied through `method.args`; a bare `family = binomial` is no longer
# forwarded to glm(), so the curve would not be a logistic fit.
(h3 <- h1 +
  stat_smooth(method = "glm", method.args = list(family = binomial)))

Bar Charts for Two Categorical Variables
# Stacked bar chart: number of customers per marital status, with each bar
# split by the insurance indicator.
ggplot(custdata2, aes(x = marital.stat, fill = health.ins)) +
  geom_bar()

table로 정리하고, data frame으로 만들어 작업하는데 있어서 한 가지 주의사항은 다음과 같이 with()를 사용하여 table로 만들어야 변수명을 그대로 사용할 수 있다는 점임.
# Widen the console output (presumably so the tables below print unwrapped).
options(width = 132)
# Cross-tabulate marital status by insurance indicator. Building the table
# inside with() keeps the bare column names as the dimension names.
(tbl.mh <- with(
  custdata2,
  table(marital.stat, health.ins)
))
## health.ins
## marital.stat FALSE TRUE
## Divorced/Separated 20 126
## Married 43 426
## Never Married 54 149
## Widowed 2 90
# Append a "Sum" row and column holding the marginal totals.
addmargins(A = tbl.mh)
## health.ins
## marital.stat FALSE TRUE Sum
## Divorced/Separated 20 126 146
## Married 43 426 469
## Never Married 54 149 203
## Widowed 2 90 92
## Sum 119 791 910
# Convert the contingency table to long format: one row per
# (marital.stat, health.ins) combination, with the count in Freq.
(tbl.mh.df <- data.frame(tbl.mh))
## marital.stat health.ins Freq
## 1 Divorced/Separated FALSE 20
## 2 Married FALSE 43
## 3 Never Married FALSE 54
## 4 Widowed FALSE 2
## 5 Divorced/Separated TRUE 126
## 6 Married TRUE 426
## 7 Never Married TRUE 149
## 8 Widowed TRUE 90
# Stacked bar chart drawn from the pre-computed counts; stat = "identity"
# makes the bar heights equal to Freq.
(g.mh <- ggplot(
  tbl.mh.df,
  aes(x = marital.stat, y = Freq, fill = health.ins)
) +
  geom_bar(stat = "identity"))

# Add three explanatory text labels on top of the bars.
g.mh +
  annotate(
    geom = "text", x = 3.2, y = 450,
    label = "대부분의 고객은 결혼",
    family = "HCR Dotum LVT", size = 4
  ) +
  annotate(
    geom = "text", x = 3, y = 270,
    label = "결혼한 적\n 없는 고객은\n보험도\n들지 않음",
    family = "HCR Dotum LVT", size = 4
  ) +
  annotate(
    geom = "text", x = 4, y = 150,
    label = "홀로 된 고객은\n 드물지만,\n보험은\n 대부분 가입",
    family = "HCR Dotum LVT", size = 4
  )

# Side-by-side bars: one bar per insurance value within each marital status.
ggplot(custdata2, aes(x = marital.stat, fill = health.ins)) +
  geom_bar(position = "dodge")

# Filled bars: every bar is scaled to height 1, showing proportions.
ggplot(custdata2, aes(x = marital.stat, fill = health.ins)) +
  geom_bar(position = "fill")

# Filled bars plus a strip of small, semi-transparent jittered points just
# below the axis (y = -0.05), one point per customer.
ggplot(custdata2, aes(x = marital.stat, fill = health.ins)) +
  geom_bar(position = "fill") +
  geom_point(
    aes(y = -0.05),
    size = 0.75, alpha = 0.3,
    position = position_jitter(height = 0.01)
  )

# Stacked bar chart of housing type, with each bar split by marital status.
(g.hm <- ggplot(custdata2, aes(x = housing.type, fill = marital.stat)) +
  geom_bar())

# Same chart with the axes swapped so the bars run horizontally.
g.hm + coord_flip()

# Cross-tabulate housing type by marital status. useNA = "ifany" adds an
# <NA> category for any variable that contains missing values.
(tbl.hm <- with(
  custdata2,
  table(housing.type, marital.stat, useNA = "ifany")
))
## marital.stat
## housing.type Divorced/Separated Married Never Married Widowed
## Homeowner free and clear 17 84 14 36
## Homeowner with mortgage/loan 54 266 46 24
## Occupied with no rent 1 5 3 2
## Rented 70 108 125 24
## <NA> 4 6 15 6
# Append a "Sum" row and column holding the marginal totals.
addmargins(A = tbl.hm)
## marital.stat
## housing.type Divorced/Separated Married Never Married Widowed Sum
## Homeowner free and clear 17 84 14 36 151
## Homeowner with mortgage/loan 54 266 46 24 390
## Occupied with no rent 1 5 3 2 11
## Rented 70 108 125 24 327
## <NA> 4 6 15 6 31
## Sum 146 469 203 92 910
# Convert the contingency table to long format: one row per
# (housing.type, marital.stat) combination, with the count in Freq.
(tbl.hm.df <- data.frame(tbl.hm))
## housing.type marital.stat Freq
## 1 Homeowner free and clear Divorced/Separated 17
## 2 Homeowner with mortgage/loan Divorced/Separated 54
## 3 Occupied with no rent Divorced/Separated 1
## 4 Rented Divorced/Separated 70
## 5 <NA> Divorced/Separated 4
## 6 Homeowner free and clear Married 84
## 7 Homeowner with mortgage/loan Married 266
## 8 Occupied with no rent Married 5
## 9 Rented Married 108
## 10 <NA> Married 6
## 11 Homeowner free and clear Never Married 14
## 12 Homeowner with mortgage/loan Never Married 46
## 13 Occupied with no rent Never Married 3
## 14 Rented Never Married 125
## 15 <NA> Never Married 15
## 16 Homeowner free and clear Widowed 36
## 17 Homeowner with mortgage/loan Widowed 24
## 18 Occupied with no rent Widowed 2
## 19 Rented Widowed 24
## 20 <NA> Widowed 6
# Horizontal stacked bar chart of the pre-computed counts, with the housing
# types ordered by reorder(housing.type, Freq) and explicit axis titles.
ggplot(
  tbl.hm.df,
  aes(x = reorder(housing.type, Freq), y = Freq, fill = marital.stat)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Housing Type") +
  ylab("Count")

# One panel per housing type, each with its own y scale, showing the count
# of customers per marital status.
ggplot(custdata2, aes(x = marital.stat)) +
  geom_bar(position = "dodge") +
  facet_wrap(~housing.type, scales = "free_y")

# Same panels with semi-transparent bars and x labels rotated 45 degrees.
ggplot(custdata2, aes(x = marital.stat)) +
  geom_bar(position = "dodge", alpha = 0.5) +
  facet_wrap(~housing.type, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# As above, but with the panels laid out in two columns.
ggplot(custdata2, aes(x = marital.stat)) +
  geom_bar(position = "dodge", alpha = 0.5) +
  facet_wrap(~housing.type, scales = "free_y", ncol = 2) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Roles swapped: one panel per marital status, bars per housing type, with
# the x labels rotated 90 degrees.
ggplot(custdata2, aes(x = housing.type)) +
  geom_bar(position = "dodge", alpha = 0.5) +
  facet_wrap(~marital.stat, scales = "free_y", ncol = 2) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mosaic plot of the housing-type by marital-status table, default settings.
mosaicplot(x = tbl.hm)

# Add a title and axis titles; las = 2 sets the axis label orientation.
mosaicplot(
  x = tbl.hm,
  main = "Marital Status and Housing Type",
  xlab = "Housing Type",
  ylab = "Marital Status",
  las = 2
)

# Same plot with the tiles shaded using four rainbow colours.
mosaicplot(
  x = tbl.hm,
  main = "Marital Status and Housing Type",
  xlab = "Housing Type",
  ylab = "Marital Status",
  las = 2,
  color = rainbow(4)
)
